#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@Description : 递归批量读取30W文本
@Time : 2020/1/2 23:41
@Author : sky
@Site :
@File : FileRead.py
@Software: PyCharm
"""

import os
import time


def traversal_dir(root_dir):
    """
    Recursively walk ``root_dir`` and print a progress line while visiting files.

    Entries are streamed with ``os.scandir`` rather than materialised with
    ``os.listdir``; the file/directory checks can then usually reuse the type
    information that comes with the directory listing instead of issuing an
    extra ``stat`` call per entry.

    :param root_dir: directory at which the traversal starts
    :return: None; progress is only reported on stdout
    """
    with os.scandir(root_dir) as entries:
        for index, entry in enumerate(entries):
            if entry.is_file():
                # Operate on the file here. ``index`` is the entry's position
                # inside its own directory (sub-directories are counted too),
                # so a line is printed only when the entry at position
                # 0, 10000, 20000, ... of a directory is a file.
                if index % 10000 == 0:
                    print('{c} *** {t} *** {i} \t docs has been read'
                          .format(c=root_dir, i=index, t=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())))
            elif entry.is_dir():
                # Descend immediately so output keeps the same depth-first
                # order as the directory listing.
                traversal_dir(entry.path)


if __name__ == '__main__':
    # Script entry point: walk the dataset directory and report how long it took.
    root_dir = '../dataset/CSCMNews'

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()
    traversal_dir(root_dir)
    end_time = time.perf_counter()

    print('Total Cost Time %.2f' % (end_time - start_time) + 's')
